In [268]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
In [269]:
df = pd.read_csv(r"C:\UNCC Freshman Year\education_dataset (1).csv")
df = pd.DataFrame(df)
In [270]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2000 entries, 0 to 1999 Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 student_id 2000 non-null int64 1 age 2000 non-null int64 2 study_hours_per_week 2000 non-null float64 3 attendance_rate 2000 non-null float64 4 homework_completion 2000 non-null float64 5 reading_score 2000 non-null float64 6 math_score 2000 non-null float64 7 science_score 2000 non-null float64 8 parent_education_level 2000 non-null object 9 family_income 2000 non-null int64 10 student_ethnicity 2000 non-null object 11 disability_status 2000 non-null object 12 tutoring_support 2000 non-null object 13 internet_access 2000 non-null object 14 school_type 2000 non-null object 15 region 2000 non-null object 16 social_media_hours 2000 non-null float64 17 gaming_hours 2000 non-null float64 18 num_siblings 2000 non-null int64 19 locker_number 2000 non-null object 20 bus_arrival_time 2000 non-null int64 21 favorite_subject 2000 non-null object 22 final_exam_score 2000 non-null float64 23 passed_course 2000 non-null object 24 college_admission 2000 non-null object 25 average_score 2000 non-null float64 dtypes: float64(10), int64(5), object(11) memory usage: 406.4+ KB
In [271]:
# First 5 rows of the dataframe
df.head()
Out[271]:
| student_id | age | study_hours_per_week | attendance_rate | homework_completion | reading_score | math_score | science_score | parent_education_level | family_income | ... | social_media_hours | gaming_hours | num_siblings | locker_number | bus_arrival_time | favorite_subject | final_exam_score | passed_course | college_admission | average_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 17 | 12.9 | 79.1 | 66.9 | 73.6 | 51.6 | 82.6 | college | 69815 | ... | 3.1 | 1.7 | 2 | A196 | 6 | history | 47.9 | no | rejected | 69.266667 |
| 1 | 2 | 18 | 16.6 | 97.2 | 78.2 | 70.6 | 56.9 | 80.7 | graduate | 76934 | ... | 4.2 | 1.9 | 3 | A289 | 8 | math | 58.9 | no | rejected | 69.400000 |
| 2 | 3 | 16 | 8.3 | 73.3 | 60.2 | 79.1 | 77.7 | 70.1 | college | 56996 | ... | 1.4 | 2.3 | 1 | A959 | 7 | science | 60.3 | yes | rejected | 75.633333 |
| 3 | 4 | 18 | 5.9 | 78.6 | 87.1 | 70.7 | 70.8 | 70.4 | graduate | 89833 | ... | 5.2 | 2.7 | 1 | A294 | 6 | math | 63.0 | yes | rejected | 70.633333 |
| 4 | 5 | 18 | 9.7 | 60.5 | 52.6 | 90.6 | 69.2 | 97.8 | graduate | 51160 | ... | 2.8 | 2.9 | 0 | A433 | 7 | english | 70.7 | yes | rejected | 85.866667 |
5 rows × 26 columns
In [272]:
df.describe()
Out[272]:
| student_id | age | study_hours_per_week | attendance_rate | homework_completion | reading_score | math_score | science_score | family_income | social_media_hours | gaming_hours | num_siblings | bus_arrival_time | final_exam_score | average_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.00000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.0000 | 2000.000000 | 2000.000000 | 2000.000000 |
| mean | 1000.500000 | 15.991500 | 10.044950 | 79.815800 | 74.42150 | 69.900950 | 68.012300 | 71.870950 | 60424.172500 | 3.025050 | 2.000750 | 1.5080 | 7.015000 | 57.456300 | 69.928067 |
| std | 577.494589 | 1.430892 | 2.975495 | 11.495014 | 14.37666 | 10.514535 | 11.579182 | 10.881476 | 15225.326072 | 1.507197 | 1.201256 | 1.1084 | 0.818603 | 7.366265 | 6.336277 |
| min | 1.000000 | 14.000000 | 0.900000 | 60.000000 | 50.00000 | 31.600000 | 20.900000 | 34.900000 | 5472.000000 | -1.700000 | -1.700000 | 0.0000 | 6.000000 | 31.800000 | 50.200000 |
| 25% | 500.750000 | 15.000000 | 8.100000 | 69.875000 | 62.00000 | 62.700000 | 60.200000 | 64.700000 | 50076.250000 | 2.000000 | 1.200000 | 1.0000 | 6.000000 | 52.400000 | 65.533333 |
| 50% | 1000.500000 | 16.000000 | 10.000000 | 79.600000 | 74.20000 | 69.800000 | 68.250000 | 71.800000 | 60537.500000 | 3.000000 | 2.000000 | 1.0000 | 7.000000 | 57.400000 | 69.800000 |
| 75% | 1500.250000 | 17.000000 | 12.000000 | 89.600000 | 86.80000 | 77.100000 | 75.700000 | 79.125000 | 70756.750000 | 4.100000 | 2.800000 | 2.0000 | 8.000000 | 62.525000 | 74.100000 |
| max | 2000.000000 | 18.000000 | 21.800000 | 100.000000 | 100.00000 | 105.300000 | 105.900000 | 109.200000 | 111434.000000 | 8.900000 | 6.600000 | 3.0000 | 8.000000 | 79.600000 | 91.233333 |
In [273]:
df.describe(include='all')
Out[273]:
| student_id | age | study_hours_per_week | attendance_rate | homework_completion | reading_score | math_score | science_score | parent_education_level | family_income | ... | social_media_hours | gaming_hours | num_siblings | locker_number | bus_arrival_time | favorite_subject | final_exam_score | passed_course | college_admission | average_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.00000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000 | 2000.000000 | ... | 2000.000000 | 2000.000000 | 2000.0000 | 2000 | 2000.000000 | 2000 | 2000.000000 | 2000 | 2000 | 2000.000000 |
| unique | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 4 | NaN | ... | NaN | NaN | NaN | 801 | NaN | 5 | NaN | 2 | 2 | NaN |
| top | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | high school | NaN | ... | NaN | NaN | NaN | A871 | NaN | english | NaN | no | rejected | NaN |
| freq | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 533 | NaN | ... | NaN | NaN | NaN | 9 | NaN | 418 | NaN | 1270 | 1984 | NaN |
| mean | 1000.500000 | 15.991500 | 10.044950 | 79.815800 | 74.42150 | 69.900950 | 68.012300 | 71.870950 | NaN | 60424.172500 | ... | 3.025050 | 2.000750 | 1.5080 | NaN | 7.015000 | NaN | 57.456300 | NaN | NaN | 69.928067 |
| std | 577.494589 | 1.430892 | 2.975495 | 11.495014 | 14.37666 | 10.514535 | 11.579182 | 10.881476 | NaN | 15225.326072 | ... | 1.507197 | 1.201256 | 1.1084 | NaN | 0.818603 | NaN | 7.366265 | NaN | NaN | 6.336277 |
| min | 1.000000 | 14.000000 | 0.900000 | 60.000000 | 50.00000 | 31.600000 | 20.900000 | 34.900000 | NaN | 5472.000000 | ... | -1.700000 | -1.700000 | 0.0000 | NaN | 6.000000 | NaN | 31.800000 | NaN | NaN | 50.200000 |
| 25% | 500.750000 | 15.000000 | 8.100000 | 69.875000 | 62.00000 | 62.700000 | 60.200000 | 64.700000 | NaN | 50076.250000 | ... | 2.000000 | 1.200000 | 1.0000 | NaN | 6.000000 | NaN | 52.400000 | NaN | NaN | 65.533333 |
| 50% | 1000.500000 | 16.000000 | 10.000000 | 79.600000 | 74.20000 | 69.800000 | 68.250000 | 71.800000 | NaN | 60537.500000 | ... | 3.000000 | 2.000000 | 1.0000 | NaN | 7.000000 | NaN | 57.400000 | NaN | NaN | 69.800000 |
| 75% | 1500.250000 | 17.000000 | 12.000000 | 89.600000 | 86.80000 | 77.100000 | 75.700000 | 79.125000 | NaN | 70756.750000 | ... | 4.100000 | 2.800000 | 2.0000 | NaN | 8.000000 | NaN | 62.525000 | NaN | NaN | 74.100000 |
| max | 2000.000000 | 18.000000 | 21.800000 | 100.000000 | 100.00000 | 105.300000 | 105.900000 | 109.200000 | NaN | 111434.000000 | ... | 8.900000 | 6.600000 | 3.0000 | NaN | 8.000000 | NaN | 79.600000 | NaN | NaN | 91.233333 |
11 rows × 26 columns
In [274]:
# Number of missing values for each variable, none have missing values
df.isnull().sum()
Out[274]:
student_id 0 age 0 study_hours_per_week 0 attendance_rate 0 homework_completion 0 reading_score 0 math_score 0 science_score 0 parent_education_level 0 family_income 0 student_ethnicity 0 disability_status 0 tutoring_support 0 internet_access 0 school_type 0 region 0 social_media_hours 0 gaming_hours 0 num_siblings 0 locker_number 0 bus_arrival_time 0 favorite_subject 0 final_exam_score 0 passed_course 0 college_admission 0 average_score 0 dtype: int64
In [275]:
df.shape
Out[275]:
(2000, 26)
In [276]:
df.size
Out[276]:
52000
In [277]:
# Number of different values; some of them have 2000 unique (direct identifiers)
# or near it (potenitally could combine to be indirect identifiers).
# Also determines how many groups needed for categorical variables.
df.nunique()
Out[277]:
student_id 2000 age 5 study_hours_per_week 166 attendance_rate 394 homework_completion 492 reading_score 465 math_score 495 science_score 486 parent_education_level 4 family_income 1957 student_ethnicity 5 disability_status 3 tutoring_support 3 internet_access 2 school_type 3 region 3 social_media_hours 91 gaming_hours 72 num_siblings 4 locker_number 801 bus_arrival_time 3 favorite_subject 5 final_exam_score 344 passed_course 2 college_admission 2 average_score 720 dtype: int64
In [278]:
dfnumerical = df[["age", "study_hours_per_week", "attendance_rate", "homework_completion", "reading_score", "math_score", "science_score", "family_income", "social_media_hours", "gaming_hours", "num_siblings", "final_exam_score"]]
dfnumerical
Out[278]:
| age | study_hours_per_week | attendance_rate | homework_completion | reading_score | math_score | science_score | family_income | social_media_hours | gaming_hours | num_siblings | final_exam_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17 | 12.9 | 79.1 | 66.9 | 73.6 | 51.6 | 82.6 | 69815 | 3.1 | 1.7 | 2 | 47.9 |
| 1 | 18 | 16.6 | 97.2 | 78.2 | 70.6 | 56.9 | 80.7 | 76934 | 4.2 | 1.9 | 3 | 58.9 |
| 2 | 16 | 8.3 | 73.3 | 60.2 | 79.1 | 77.7 | 70.1 | 56996 | 1.4 | 2.3 | 1 | 60.3 |
| 3 | 18 | 5.9 | 78.6 | 87.1 | 70.7 | 70.8 | 70.4 | 89833 | 5.2 | 2.7 | 1 | 63.0 |
| 4 | 18 | 9.7 | 60.5 | 52.6 | 90.6 | 69.2 | 97.8 | 51160 | 2.8 | 2.9 | 0 | 70.7 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1995 | 14 | 12.1 | 68.2 | 86.5 | 60.8 | 65.3 | 69.8 | 31619 | 1.8 | 0.6 | 1 | 43.3 |
| 1996 | 18 | 10.9 | 88.4 | 52.6 | 72.5 | 72.0 | 77.9 | 57876 | 2.2 | 2.0 | 1 | 68.8 |
| 1997 | 14 | 7.0 | 60.8 | 94.2 | 45.6 | 61.2 | 61.5 | 86560 | 4.0 | 2.1 | 0 | 54.0 |
| 1998 | 18 | 7.5 | 76.8 | 93.1 | 68.2 | 69.7 | 62.7 | 42991 | 1.2 | 1.2 | 0 | 52.3 |
| 1999 | 16 | 13.7 | 91.0 | 72.2 | 67.9 | 74.9 | 72.7 | 61206 | 2.8 | 1.7 | 3 | 55.1 |
2000 rows × 12 columns
In [279]:
# Higher correlation between test scores and final exam scores than other variables.
sns.heatmap(dfnumerical.corr(), annot=True, cmap='coolwarm')
plt.show()
In [280]:
df.boxplot(column = "study_hours_per_week")
Out[280]:
<Axes: >
In [281]:
df.boxplot(column = "reading_score")
Out[281]:
<Axes: >
In [282]:
df.boxplot(column = "math_score")
Out[282]:
<Axes: >
In [283]:
df.boxplot(column = "science_score")
Out[283]:
<Axes: >
In [284]:
df.boxplot(column = "family_income")
Out[284]:
<Axes: >
In [285]:
df.boxplot(column = "social_media_hours")
Out[285]:
<Axes: >
In [286]:
df.boxplot(column = "gaming_hours")
Out[286]:
<Axes: >
In [287]:
df.boxplot(column = "num_siblings")
Out[287]:
<Axes: >
In [288]:
df.boxplot(column = "age")
Out[288]:
<Axes: >
In [289]:
df.boxplot(column = "final_exam_score")
Out[289]:
<Axes: >
In [290]:
df.plot.scatter(x="study_hours_per_week", y="final_exam_score")
Out[290]:
<Axes: xlabel='study_hours_per_week', ylabel='final_exam_score'>
In [291]:
df.plot.scatter(x="attendance_rate", y="final_exam_score")
Out[291]:
<Axes: xlabel='attendance_rate', ylabel='final_exam_score'>
In [292]:
df.plot.scatter(x="homework_completion", y="final_exam_score")
Out[292]:
<Axes: xlabel='homework_completion', ylabel='final_exam_score'>
In [293]:
df.plot.scatter(x="family_income", y="final_exam_score")
Out[293]:
<Axes: xlabel='family_income', ylabel='final_exam_score'>
In [294]:
df.plot.scatter(x="social_media_hours", y="final_exam_score")
Out[294]:
<Axes: xlabel='social_media_hours', ylabel='final_exam_score'>
In [295]:
df.plot.scatter(x="gaming_hours", y="math_score")
Out[295]:
<Axes: xlabel='gaming_hours', ylabel='math_score'>
In [296]:
df.plot.scatter(x="social_media_hours", y="gaming_hours")
Out[296]:
<Axes: xlabel='social_media_hours', ylabel='gaming_hours'>
In [297]:
df.plot.scatter(x="math_score", y="reading_score")
Out[297]:
<Axes: xlabel='math_score', ylabel='reading_score'>
In [298]:
df.plot.scatter(x="math_score", y="science_score")
Out[298]:
<Axes: xlabel='math_score', ylabel='science_score'>
In [299]:
df.plot.scatter(x="reading_score", y="science_score")
Out[299]:
<Axes: xlabel='reading_score', ylabel='science_score'>
In [300]:
# Of the variables, the most identifiable trend is one between reading_score, science_score, math_score
# and final_exam_score which seem to have a somewhat positive trend.
sns.pairplot(dfnumerical)
Out[300]:
<seaborn.axisgrid.PairGrid at 0x1d78e7f9f30>
In [301]:
dfcategorical = df[["student_id", "disability_status", "student_ethnicity", "tutoring_support", "school_type", "region", "parent_education_level", "internet_access", "college_admission", "passed_course","favorite_subject", "locker_number"]]
categoricals = df.select_dtypes(include="object").columns
In [302]:
# Internet access and college admission have highly imbalanced frequencies, which may create problematic minorities.
for col in categoricals:
sns.countplot(data=df, x=col)
plt.title(f"Barchart of {col}")
plt.xlabel(col)
plt.ylabel("Count")
plt.tight_layout()
plt.show()
In [303]:
df.plot.scatter(x="tutoring_support", y="final_exam_score")
Out[303]:
<Axes: xlabel='tutoring_support', ylabel='final_exam_score'>
In [304]:
df.plot.scatter(x="internet_access", y="final_exam_score")
Out[304]:
<Axes: xlabel='internet_access', ylabel='final_exam_score'>
In [305]:
df.plot.scatter(x="school_type", y="final_exam_score")
Out[305]:
<Axes: xlabel='school_type', ylabel='final_exam_score'>
In [306]:
df.plot.scatter(x="region", y="final_exam_score")
Out[306]:
<Axes: xlabel='region', ylabel='final_exam_score'>
In [307]:
df.plot.scatter(x="parent_education_level", y="final_exam_score")
Out[307]:
<Axes: xlabel='parent_education_level', ylabel='final_exam_score'>
In [308]:
df.plot.scatter(x="college_admission", y="final_exam_score")
Out[308]:
<Axes: xlabel='college_admission', ylabel='final_exam_score'>
In [309]:
df.plot.scatter(x="favorite_subject", y="final_exam_score")
Out[309]:
<Axes: xlabel='favorite_subject', ylabel='final_exam_score'>
In [310]:
dfnumerical = df[["age", "study_hours_per_week", "attendance_rate", "homework_completion", "reading_score", "math_score", "science_score", "family_income", "social_media_hours", "gaming_hours", "num_siblings", "final_exam_score"]]
dfnumerical
Out[310]:
| age | study_hours_per_week | attendance_rate | homework_completion | reading_score | math_score | science_score | family_income | social_media_hours | gaming_hours | num_siblings | final_exam_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17 | 12.9 | 79.1 | 66.9 | 73.6 | 51.6 | 82.6 | 69815 | 3.1 | 1.7 | 2 | 47.9 |
| 1 | 18 | 16.6 | 97.2 | 78.2 | 70.6 | 56.9 | 80.7 | 76934 | 4.2 | 1.9 | 3 | 58.9 |
| 2 | 16 | 8.3 | 73.3 | 60.2 | 79.1 | 77.7 | 70.1 | 56996 | 1.4 | 2.3 | 1 | 60.3 |
| 3 | 18 | 5.9 | 78.6 | 87.1 | 70.7 | 70.8 | 70.4 | 89833 | 5.2 | 2.7 | 1 | 63.0 |
| 4 | 18 | 9.7 | 60.5 | 52.6 | 90.6 | 69.2 | 97.8 | 51160 | 2.8 | 2.9 | 0 | 70.7 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1995 | 14 | 12.1 | 68.2 | 86.5 | 60.8 | 65.3 | 69.8 | 31619 | 1.8 | 0.6 | 1 | 43.3 |
| 1996 | 18 | 10.9 | 88.4 | 52.6 | 72.5 | 72.0 | 77.9 | 57876 | 2.2 | 2.0 | 1 | 68.8 |
| 1997 | 14 | 7.0 | 60.8 | 94.2 | 45.6 | 61.2 | 61.5 | 86560 | 4.0 | 2.1 | 0 | 54.0 |
| 1998 | 18 | 7.5 | 76.8 | 93.1 | 68.2 | 69.7 | 62.7 | 42991 | 1.2 | 1.2 | 0 | 52.3 |
| 1999 | 16 | 13.7 | 91.0 | 72.2 | 67.9 | 74.9 | 72.7 | 61206 | 2.8 | 1.7 | 3 | 55.1 |
2000 rows × 12 columns
In [311]:
dfcategorical = df[["student_id", "disability_status", "student_ethnicity", "tutoring_support", "school_type", "region", "parent_education_level", "internet_access", "college_admission", "passed_course","favorite_subject", "locker_number"]]
categoricals = df.select_dtypes(include="object").columns
In [312]:
group_ats = df[df['tutoring_support'] == "none"]["final_exam_score"]
group_bts = df[df['tutoring_support'] == "private"]["final_exam_score"]
group_cts = df[df['tutoring_support'] == "school"]["final_exam_score"]
f_stat, p_value = stats.f_oneway(group_ats, group_bts, group_cts)
print(f"F-statistic: {f_stat}")
print(f"P-value: {p_value}")
F-statistic: 1.0391301792138739 P-value: 0.3539534586488434
In [313]:
group_aia = df[df['internet_access'] == "yes"]["final_exam_score"]
group_bia = df[df['internet_access'] == "no"]["final_exam_score"]
f_stat2, p_value2 = stats.f_oneway(group_aia, group_bia)
print(f"F-statistic: {f_stat2}")
print(f"P-value: {p_value2}")
F-statistic: 0.15621909184828248 P-value: 0.6927038283601827
In [314]:
variable1 = df['study_hours_per_week']
variable2= df['final_exam_score']
correlation_matrix = np.corrcoef(variable1, variable2)
correlation_coefficient = correlation_matrix[0,1]
print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: 0.04388597139243337
In [315]:
variable1 = df['attendance_rate']
variable2= df['final_exam_score']
correlation_matrix = np.corrcoef(variable1, variable2)
correlation_coefficient = correlation_matrix[0,1]
print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: -0.029180875516787795
In [316]:
variable1 = df['homework_completion']
variable2= df['final_exam_score']
correlation_matrix = np.corrcoef(variable1, variable2)
correlation_coefficient = correlation_matrix[0,1]
print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: -0.007697737671186041
In [317]:
variable1 = df['reading_score']
variable2= df['final_exam_score']
correlation_matrix = np.corrcoef(variable1, variable2)
correlation_coefficient = correlation_matrix[0,1]
print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: 0.43939412790683136
In [318]:
variable1 = df['math_score']
variable2= df['final_exam_score']
correlation_matrix = np.corrcoef(variable1, variable2)
correlation_coefficient = correlation_matrix[0,1]
print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: 0.4674740308729188
In [319]:
variable1 = df['science_score']
variable2= df['final_exam_score']
correlation_matrix = np.corrcoef(variable1, variable2)
correlation_coefficient = correlation_matrix[0,1]
print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: 0.32962909455106154
In [320]:
variable1 = df['average_score']
variable2= df['final_exam_score']
correlation_matrix = np.corrcoef(variable1, variable2)
correlation_coefficient = correlation_matrix[0,1]
print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: 0.7165008031621214
In [321]:
variable1 = df['social_media_hours']
variable2= df['final_exam_score']
correlation_matrix = np.corrcoef(variable1, variable2)
correlation_coefficient = correlation_matrix[0,1]
print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: -0.010790465578582485
In [322]:
variable1 = df['gaming_hours']
variable2= df['final_exam_score']
correlation_matrix = np.corrcoef(variable1, variable2)
correlation_coefficient = correlation_matrix[0,1]
print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: -0.03487168383776866
In [323]:
variable1 = df['num_siblings']
variable2= df['final_exam_score']
correlation_matrix = np.corrcoef(variable1, variable2)
correlation_coefficient = correlation_matrix[0,1]
print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: -0.031964277957644775
In [324]:
variable1 = df['family_income']
variable2= df['final_exam_score']
correlation_matrix = np.corrcoef(variable1, variable2)
correlation_coefficient = correlation_matrix[0,1]
print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: -0.012188220444990253
In [325]:
group_ra = df[df['region'] == "urban"]["final_exam_score"]
group_rb = df[df['region'] == "suburban"]["final_exam_score"]
group_rc = df[df['region'] == "rural"]["final_exam_score"]
f_stat2, p_value2 = stats.f_oneway(group_ra, group_rb, group_rc)
print(f"F-statistic: {f_stat2}")
print(f"P-value: {p_value2}")
F-statistic: 1.316765240207636 P-value: 0.2682334045302832
In [326]:
group_sta = df[df['school_type'] == "private"]["final_exam_score"]
group_stb = df[df['school_type'] == "public"]["final_exam_score"]
group_stc = df[df['school_type'] == "charter"]["final_exam_score"]
f_stat2, p_value2 = stats.f_oneway(group_sta, group_stb, group_stc)
print(f"F-statistic: {f_stat2}")
print(f"P-value: {p_value2}")
F-statistic: 0.3791471622121001 P-value: 0.684494139394759
In [327]:
rn = df[["reading_score", "science_score", "math_score", "study_hours_per_week"]]
rc = df[["tutoring_support", "region"]]
rn1 = df[["average_score", "study_hours_per_week"]]
rn2 = df[["reading_score", "math_score"]]
rn3 = df[["reading_score", "science_score"]]
rn4 = df[["science_score", "math_score"]]
rn5 = df[["reading_score", "study_hours_per_week"]]
rn6 = df[["science_score", "study_hours_per_week"]]
rn7 = df[["math_score", "study_hours_per_week"]]
In [328]:
sns.heatmap(rn.corr(), annot=True, cmap='coolwarm')
plt.show()
In [329]:
# Combining reading_score, science_score, and math_score because of high VIF.
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
vif_data = pd.DataFrame()
vif_data["feature"] = rn.columns
vif_data["VIF"] = [variance_inflation_factor(rn.values, i) for i in range(rn.shape[1])]
print(vif_data)
feature VIF 0 reading_score 30.175144 1 science_score 29.539610 2 math_score 26.918671 3 study_hours_per_week 11.383586
In [330]:
# Removing study_hours_per_week to reduce multicollinearity.
vif_data = pd.DataFrame()
vif_data["feature"] = rn1.columns
vif_data["VIF"] = [variance_inflation_factor(rn1.values, i) for i in range(rn1.shape[1])]
print(vif_data)
feature VIF 0 average_score 11.379271 1 study_hours_per_week 11.379271
In [331]:
from scipy.stats import chi2_contingency
contingency_table = pd.crosstab(df["tutoring_support"], df["region"])
chi2, p_value, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p_value}")
print(f"Degrees of freedom: {dof}")
print(f"Expected frequencies:\n{expected}")
Chi-square statistic: 2.837654398960902 P-value: 0.5853505355723624 Degrees of freedom: 4 Expected frequencies: [[228.942 244.2285 237.8295] [212.52 226.71 220.77 ] [202.538 216.0615 210.4005]]
In [332]:
df["final_score"] = df[["reading_score", "math_score", "science_score"]].mean(axis=1)
df.head()
Out[332]:
| student_id | age | study_hours_per_week | attendance_rate | homework_completion | reading_score | math_score | science_score | parent_education_level | family_income | ... | gaming_hours | num_siblings | locker_number | bus_arrival_time | favorite_subject | final_exam_score | passed_course | college_admission | average_score | final_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 17 | 12.9 | 79.1 | 66.9 | 73.6 | 51.6 | 82.6 | college | 69815 | ... | 1.7 | 2 | A196 | 6 | history | 47.9 | no | rejected | 69.266667 | 69.266667 |
| 1 | 2 | 18 | 16.6 | 97.2 | 78.2 | 70.6 | 56.9 | 80.7 | graduate | 76934 | ... | 1.9 | 3 | A289 | 8 | math | 58.9 | no | rejected | 69.400000 | 69.400000 |
| 2 | 3 | 16 | 8.3 | 73.3 | 60.2 | 79.1 | 77.7 | 70.1 | college | 56996 | ... | 2.3 | 1 | A959 | 7 | science | 60.3 | yes | rejected | 75.633333 | 75.633333 |
| 3 | 4 | 18 | 5.9 | 78.6 | 87.1 | 70.7 | 70.8 | 70.4 | graduate | 89833 | ... | 2.7 | 1 | A294 | 6 | math | 63.0 | yes | rejected | 70.633333 | 70.633333 |
| 4 | 5 | 18 | 9.7 | 60.5 | 52.6 | 90.6 | 69.2 | 97.8 | graduate | 51160 | ... | 2.9 | 0 | A433 | 7 | english | 70.7 | yes | rejected | 85.866667 | 85.866667 |
5 rows × 27 columns
In [333]:
numeric_cols_all = df.select_dtypes(include=np.number).columns.tolist()
# We only want these numeric columns:
numeric_keep = ["math_score", "reading_score", "science_score"]
# Build df2 with EXACT columns requested
df2 = df[["final_score", "tutoring_support", "region"] + numeric_keep]
# Remove duplicates if any
df2 = df2.loc[:, ~df2.columns.duplicated()]
# Define categorical and numeric columns
categorical_cols = ["tutoring_support", "region"]
numeric_cols = ["math_score", "reading_score", "science_score"]
df2.head()
Out[333]:
| final_score | tutoring_support | region | math_score | reading_score | science_score | |
|---|---|---|---|---|---|---|
| 0 | 69.266667 | school | urban | 51.6 | 73.6 | 82.6 |
| 1 | 69.400000 | none | suburban | 56.9 | 70.6 | 80.7 |
| 2 | 75.633333 | school | urban | 77.7 | 79.1 | 70.1 |
| 3 | 70.633333 | none | rural | 70.8 | 70.7 | 70.4 |
| 4 | 85.866667 | private | suburban | 69.2 | 90.6 | 97.8 |
In [334]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
numeric_keep = ["math_score", "reading_score", "science_score"]
df2 = df[["final_score", "tutoring_support", "region"] + numeric_keep]
# Remove duplicated columns if any
df2 = df2.loc[:, ~df2.columns.duplicated()]
# Define categorical and numeric columns
categorical_cols = ["tutoring_support", "region"]
numeric_cols = ["math_score", "reading_score", "science_score"]
In [335]:
preprocessor = ColumnTransformer(
transformers=[
("num", StandardScaler(), numeric_cols),
("cat", OneHotEncoder(drop='first'), categorical_cols)
]
)
# Fit and transform X (all predictors)
X = df2.drop(columns=["final_score"])
y = df2["final_score"]
X_processed = preprocessor.fit_transform(X)
# Get names of encoded categorical variables
encoded_cat_names = preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_cols)
# Construct final feature names
final_feature_names = numeric_cols + list(encoded_cat_names)
# Build DataFrame
df2_processed = pd.DataFrame(X_processed, columns=final_feature_names)
df2_processed["final_score"] = y.values
df2_processed["average_score"] = df2_processed[["math_score", "reading_score", "science_score"]].mean(axis=1)
df2_processed.head()
Out[335]:
| math_score | reading_score | science_score | tutoring_support_private | tutoring_support_school | region_suburban | region_urban | final_score | average_score | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.417752 | 0.351891 | 0.986239 | 0.0 | 1.0 | 0.0 | 1.0 | 69.266667 | -0.026540 |
| 1 | -0.959919 | 0.066501 | 0.811586 | 0.0 | 0.0 | 1.0 | 0.0 | 69.400000 | -0.027277 |
| 2 | 0.836857 | 0.875108 | -0.162790 | 0.0 | 1.0 | 0.0 | 1.0 | 75.633333 | 0.516392 |
| 3 | 0.240811 | 0.076014 | -0.135213 | 0.0 | 0.0 | 0.0 | 0.0 | 70.633333 | 0.060537 |
| 4 | 0.102598 | 1.969105 | 2.383457 | 1.0 | 0.0 | 1.0 | 0.0 | 85.866667 | 1.485053 |
In [336]:
df_model = df2_processed[
[
"final_score",
"average_score",
"tutoring_support_private",
"tutoring_support_school",
"region_suburban",
"region_urban",
]
].copy()
df_model
Out[336]:
| final_score | average_score | tutoring_support_private | tutoring_support_school | region_suburban | region_urban | |
|---|---|---|---|---|---|---|
| 0 | 69.266667 | -0.026540 | 0.0 | 1.0 | 0.0 | 1.0 |
| 1 | 69.400000 | -0.027277 | 0.0 | 0.0 | 1.0 | 0.0 |
| 2 | 75.633333 | 0.516392 | 0.0 | 1.0 | 0.0 | 1.0 |
| 3 | 70.633333 | 0.060537 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 85.866667 | 1.485053 | 1.0 | 0.0 | 1.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... |
| 1995 | 65.300000 | -0.430147 | 1.0 | 0.0 | 1.0 | 0.0 |
| 1996 | 74.133333 | 0.381975 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1997 | 56.100000 | -1.284515 | 0.0 | 1.0 | 0.0 | 0.0 |
| 1998 | 66.866667 | -0.286346 | 0.0 | 1.0 | 0.0 | 0.0 |
| 1999 | 71.833333 | 0.160280 | 1.0 | 0.0 | 0.0 | 0.0 |
2000 rows × 6 columns
In [337]:
df_model_y = df_model["final_score"]
df_model_X = df_model.drop(columns=["final_score"])
In [338]:
# We averaged performance metrics and coefficients across the folds to see how the model would perform on new data.
from sklearn.metrics import r2_score
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import KFold
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
r_squared_scores = []
rmse_scores = []
GME_urban = []
GME_suburban = []
GME_rural = []
meanprediction_urban = []
meanprediction_suburban = []
meanprediction_rural = []
intercept = []
coefficient1 = []
coefficient2 = []
coefficient3 = []
coefficient4 = []
coefficient5 = []
coefficient6 = []
fold = 1
for train_index, test_index in kf.split(df_model):
train_data = df_model.iloc[train_index]
xtest_data, ytest_data = df_model_X.iloc[test_index], df_model_y.iloc[test_index]
# Fit the OLS model on the training data
model = smf.ols(formula= """
final_score ~ average_score
+ tutoring_support_private
+ tutoring_support_school
+ region_suburban
+ region_urban
""", data=train_data).fit()
predictions = model.predict(xtest_data)
residuals = ytest_data - predictions
coefficients = model.params
intercept.append(coefficients[0])
coefficient1.append(coefficients[1])
coefficient2.append(coefficients[2])
coefficient3.append(coefficients[3])
coefficient4.append(coefficients[4])
coefficient5.append(coefficients[5])
r_squared = model.rsquared
rmse = np.sqrt(np.mean((ytest_data - predictions)**2))
r_squared_scores.append(r_squared)
rmse_scores.append(rmse)
#mean_residual_by_group = residuals2.groupby('Region')['residuals'].mean()
#GME_urban.append(mean_residual_by_group.get('urban', 0))
#GME_suburban.append(mean_residual_by_group.get('suburban', 0))
#GME_rural.append(mean_residual_by_group.get('rural', 0))
#mean_predictions_by_group = predictions2.groupby('Region')['predictions'].mean()
#meanprediction_urban.append(mean_predictions_by_group.get('urban', 0))
#meanprediction_suburban.append(mean_predictions_by_group.get('suburban', 0))
#meanprediction_rural.append(mean_predictions_by_group.get('rural', 0))
reset_test = sm.stats.linear_reset(model, use_f=True)
print(f"\nRamsey RESET test {fold} (F-test): {reset_test}")
plt.figure(figsize=(6, 4))
plt.scatter(model.fittedvalues, model.resid, alpha=0.6)
plt.axhline(0, color="red", linestyle="--")
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")
plt.title("Residuals vs Fitted Values")
plt.tight_layout()
plt.show()
influence = model.get_influence()
cooks_d = influence.cooks_distance
plt.figure(figsize=(10, 6))
plt.stem(cooks_d[0], markerfmt=",")
plt.title("Cook's Distance Plot")
plt.xlabel("Observation Index")
plt.ylabel("Cook's Distance")
plt.show()
# print(model.summary())
fold += 1
print(f"Mean R-squared across folds: {np.mean(r_squared_scores):.4f}")
print(f"Mean MSE across folds: {np.mean(rmse_scores):.4f}")
#print("Average Coefficient for tutoring_support_none:", round(np.mean(coefficient1), 4))
print("Average Intercept:", round(np.mean(intercept), 4))
print("Average Coefficient for average_score:", round(np.mean(coefficient1), 4))
print("Average Coefficient for tutoring_support_school:", round(np.mean(coefficient2), 4))
print("Average Coefficient for tutoring_support_private:", round(np.mean(coefficient3), 4))
print("Average Coefficient for region_urban:", round(np.mean(coefficient4), 4))
print("Average Coefficient for region_suburban:", round(np.mean(coefficient5), 4))
#print("Average Coefficient for region_rural:", round(np.mean(coefficient6), 4))
#print("Average GME for urban:", round(np.mean(GME_urban), 4))
#print("Average GME for suburban:", round(np.mean(GME_suburban), 4))
#print("Average GME for rural:", round(np.mean(GME_rural), 4))
#print("Average prediction for urban:", round(np.mean(meanprediction_urban), 4))
#print("Average prediction for suburban:", round(np.mean(meanprediction_suburban), 4))
#print("Average prediction for rural:", round(np.mean(meanprediction_rural), 4))
Ramsey RESET test 1 (F-test): <F test: F=1.48186788283807, p=0.2275261191286845, df_denom=1.59e+03, df_num=2>
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:47: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` intercept.append(coefficients[0]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:48: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient1.append(coefficients[1]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:49: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient2.append(coefficients[2]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient3.append(coefficients[3]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:51: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient4.append(coefficients[4]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:52: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient5.append(coefficients[5])
Ramsey RESET test 2 (F-test): <F test: F=0.32087908966718337, p=0.7255578791515537, df_denom=1.59e+03, df_num=2>
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:47: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` intercept.append(coefficients[0]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:48: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient1.append(coefficients[1]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:49: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient2.append(coefficients[2]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient3.append(coefficients[3]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:51: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient4.append(coefficients[4]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:52: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient5.append(coefficients[5])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:47: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` intercept.append(coefficients[0]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:48: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient1.append(coefficients[1]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:49: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient2.append(coefficients[2]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient3.append(coefficients[3]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:51: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient4.append(coefficients[4]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:52: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient5.append(coefficients[5])
Ramsey RESET test 3 (F-test): <F test: F=0.6544175872783571, p=0.5198844385835868, df_denom=1.59e+03, df_num=2>
Ramsey RESET test 4 (F-test): <F test: F=2.183742928154264, p=0.11295644693303862, df_denom=1.59e+03, df_num=2>
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:47: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` intercept.append(coefficients[0]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:48: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient1.append(coefficients[1]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:49: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient2.append(coefficients[2]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient3.append(coefficients[3]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:51: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient4.append(coefficients[4]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:52: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient5.append(coefficients[5])
Ramsey RESET test 5 (F-test): <F test: F=0.5768955468899213, p=0.561756612186604, df_denom=1.59e+03, df_num=2>
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:47: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` intercept.append(coefficients[0]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:48: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient1.append(coefficients[1]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:49: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient2.append(coefficients[2]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient3.append(coefficients[3]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:51: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient4.append(coefficients[4]) C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:52: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` coefficient5.append(coefficients[5])
Mean R-squared across folds: 0.9984 Mean MSE across folds: 0.2538 Average Intercept: 69.9285 Average Coefficient for average_score: 10.9929 Average Coefficient for tutoring_support_school: -0.0025 Average Coefficient for tutoring_support_private: -0.0044 Average Coefficient for region_urban: -0.0052 Average Coefficient for region_suburban: 0.0105
In [339]:
df_model2 = df_model.drop(columns=["final_score"])
sns.heatmap(df_model2.corr(), annot=True, cmap='coolwarm')
plt.show()